# Setup ----
# Run this script in a fresh R session. The workspace is deliberately not
# cleared here: wiping the global environment would destroy the caller's
# objects when the script is sourced, without resetting attached packages
# or options.

# Attach dependencies with library() so that a missing package stops the
# script immediately; require() would only warn and return FALSE, deferring
# the failure to the first call into the package.
library(quanteda)
library(stringi)

# Project-local helpers. NOTE(review): tokenize_english() and stopwords_en
# used below are presumably defined in this file -- confirm.
source("functions.R")

# Training corpus ----
# Read the English corpus and keep only documents where the docvar `dupli`
# is FALSE, `noise` lies below its 90th percentile (NAs are ignored when
# the quantile is computed) and `n_sent` is greater than 1.
corp <- readRDS("data_corpus_en.RDS")
corp <- corpus_subset(
    corp,
    !dupli & noise < quantile(noise, 0.9, na.rm = TRUE) & n_sent > 1
)

# Tokenize with tokenize_english() (not defined in this file), drop the
# entries of stopwords_en, and cache the resulting tokens on disk.
toks <- tokenize_english(corp)
toks <- tokens_remove(toks, stopwords_en)
saveRDS(toks, "data_tokens_en.RDS")

# Test corpus ----
# NOTE(review): seqs_cap, used for compounding below, is presumably one of
# the objects restored by this load() call -- confirm.
load("collocations_en.Rdata")
corp_mnu <- readRDS("data_corpus_manual_en.RDS")

# Tokenize the manual corpus (URLs removed), then in one chain:
#   1. keep only tokens consisting entirely of ASCII letters, digits,
#      hyphens or apostrophes (case-sensitive regex match);
#   2. remove the entries of stopwords_en;
#   3. join the sequences in seqs_cap with z > 3 into single tokens,
#      separated by a space.
# Steps 1 and 2 use padding = TRUE, so removed tokens leave empty
# placeholders and the original token positions are preserved.
toks_mnu <- tokens(corp_mnu, remove_url = TRUE) %>%
    tokens_select("^[0-9a-zA-Z\\-']+$", valuetype = "regex",
                  case_insensitive = FALSE, padding = TRUE) %>%
    tokens_remove(stopwords_en, padding = TRUE) %>%
    tokens_compound(seqs_cap[seqs_cap$z > 3],
                    concatenator = ' ', join = TRUE)
saveRDS(toks_mnu, "data_tokens_manual_en.RDS")
